package crawl;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Crawls a web site starting from a base URL, following anchor links that stay
 * on the same host, and prints every link that was discovered.
 *
 * <p>Links are extracted line by line with a regular expression, so anchors that
 * span several lines of HTML are not detected.
 */
public class UrlPool {
	/** Matches the optional scheme plus the host part of a URL, e.g. {@code http://example.com}. */
	private static final Pattern HOST_PATTERN = Pattern.compile("(https?://)?[^/\\s]*");

	/**
	 * Matches an anchor element and captures its href value in group 1. The link
	 * text is matched reluctantly so that several anchors on the same line are
	 * reported as separate matches instead of being swallowed by the first one.
	 */
	private static final Pattern LINK_PATTERN =
			Pattern.compile("<a.*?href=[\"']?((https?://)?/?[^\"']+)[\"']?.*?>(.+?)</a>");

	/** Maximum time to wait for a TCP connection, in milliseconds. */
	private static final int CONNECT_TIMEOUT_MS = 10_000;

	/** Maximum time to wait for response data, in milliseconds. */
	private static final int READ_TIMEOUT_MS = 15_000;

	/**
	 * Program entry point: crawls the hard-coded start page.
	 *
	 * @param args ignored
	 */
	public static void main(String[] args) {
		// Alternative start page: getUrl("https://ffyx.vip/");
		getUrl("http://yhdm80.com/");
	}

	/**
	 * Crawls every same-host link reachable from {@code baseUrl} and prints all
	 * collected links to standard output.
	 *
	 * @param baseUrl the page the crawl starts from
	 */
	private static void getUrl(String baseUrl) {
		// Link -> "already fetched" flag; insertion order is the discovery order.
		Map<String, Boolean> links = new LinkedHashMap<>();

		String host = "";
		Matcher hostMatcher = HOST_PATTERN.matcher(baseUrl);
		if (hostMatcher.find()) {
			host = hostMatcher.group();
		}

		links.put(baseUrl, false);
		crawLinks(host, links);

		for (String url : links.keySet()) {
			System.out.println("链接：" + url);
		}
	}

	/**
	 * Fetches every not-yet-visited link in {@code oldMap}, adds the same-host
	 * links found on those pages, and repeats until a pass discovers nothing new.
	 *
	 * @param oldLinkHost scheme and host prefix a link must start with to be followed
	 * @param oldMap      link -> visited flag; updated in place
	 * @return {@code oldMap}, containing every discovered link
	 */
	private static Map<String, Boolean> crawLinks(String oldLinkHost, Map<String, Boolean> oldMap) {
		Map<String, Boolean> newMap;
		do {
			newMap = new LinkedHashMap<>();
			for (Map.Entry<String, Boolean> entry : oldMap.entrySet()) {
				if (Boolean.TRUE.equals(entry.getValue())) {
					continue;
				}
				collectLinks(entry.getKey(), oldLinkHost, oldMap, newMap);
				// Mark the page as visited so later passes do not download it again.
				// setValue is not a structural change, so it is safe while iterating.
				entry.setValue(true);
			}
			// Newly found links are merged only after the iteration has finished.
			oldMap.putAll(newMap);
		} while (!newMap.isEmpty());

		return oldMap;
	}

	/**
	 * Downloads one page and records every same-host link on it that is not known yet.
	 * I/O failures are reported on standard error and otherwise ignored so that one
	 * broken page does not abort the whole crawl.
	 *
	 * @param pageUrl address of the page to download
	 * @param host    scheme and host prefix used to resolve and filter links
	 * @param known   links that were discovered earlier (read only here)
	 * @param found   receives the links discovered on this page
	 */
	private static void collectLinks(String pageUrl, String host,
			Map<String, Boolean> known, Map<String, Boolean> found) {
		try {
			HttpURLConnection connection = (HttpURLConnection) new URL(pageUrl).openConnection();
			connection.setRequestMethod("GET");
			// Without timeouts a single unresponsive server would block the crawl forever.
			connection.setConnectTimeout(CONNECT_TIMEOUT_MS);
			connection.setReadTimeout(READ_TIMEOUT_MS);

			if (connection.getResponseCode() != HttpURLConnection.HTTP_OK) {
				// The body is never read in this case, so release the connection explicitly.
				connection.disconnect();
				return;
			}

			try (BufferedReader reader = new BufferedReader(new InputStreamReader(connection.getInputStream()))) {
				String line;
				while ((line = reader.readLine()) != null) {
					Matcher matcher = LINK_PATTERN.matcher(line);
					// A single line may hold many anchors (e.g. minified HTML).
					while (matcher.find()) {
						String newLink = matcher.group(1).trim();
						if (!newLink.startsWith("http")) {
							// Relative links are resolved against the host root.
							newLink = host + (newLink.startsWith("/") ? "" : "/") + newLink;
						}
						// A trailing slash is kept on purpose: some links fail without it.

						// De-duplicate and stay on the same host.
						if (!known.containsKey(newLink) && !found.containsKey(newLink) && newLink.startsWith(host)) {
							System.out.println("添加：" + newLink);
							found.put(newLink, false);
						}
					}
				}
			}
		} catch (IOException e) {
			System.err.println(pageUrl + " 访问失败");
			System.err.println("失败原因："+e.getMessage());
		}
	}
}
